{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# AdventureWorks - Hard"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading spark-stubs, spark-hive\n",
      "Adding Hive conf dir /opt/hive/conf to classpath\n",
      "Creating SparkSession\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "SLF4J: No SLF4J providers were found.\n",
      "SLF4J: Defaulting to no-operation (NOP) logger implementation\n",
      "SLF4J: See https://www.slf4j.org/codes.html#noProviders for further details.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<a target=\"_blank\" href=\"http://jupyter:4040\">Spark UI</a>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36m$ivy.$                                  \n",
       "\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.log4j.{Level, Logger}\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.types._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.functions._\n",
       "\u001b[39m\n",
       "\u001b[32mimport \u001b[39m\u001b[36morg.apache.spark.sql.expressions.Window\n",
       "\n",
       "\u001b[39m\n",
       "\u001b[36mspark\u001b[39m: \u001b[32mSparkSession\u001b[39m = org.apache.spark.sql.SparkSession@25a4a4ea"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import $ivy.`org.apache.spark::spark-sql:3.4.0`\n",
    "\n",
    "import org.apache.log4j.{Level, Logger}\n",
    "Logger.getLogger(\"org\").setLevel(Level.OFF)\n",
    "\n",
    "import org.apache.spark._\n",
    "import org.apache.spark.sql._\n",
    "import org.apache.spark.sql.types._\n",
    "import org.apache.spark.sql.functions._\n",
    "import org.apache.spark.sql.expressions.Window\n",
    "\n",
    "val spark = {\n",
    "    NotebookSparkSession.builder()\n",
    "    .progress(false)\n",
    "    .appName(\"app14-3\")\n",
    "    // .master(\"spark://192.168.31.31:7077\")\n",
    "    .master(\"local[*]\")\n",
    "    .config(\"spark.sql.warehouse.dir\", \n",
    "            \"hdfs://192.168.31.31:9000/user/hive/warehouse\") \n",
    "    .config(\"spark.cores.max\", \"4\") \n",
    "    .config(\"spark.executor.instances\", \"1\") \n",
    "    .config(\"spark.executor.cores\", \"2\") \n",
    "    .config(\"spark.executor.memory\", \"10g\") \n",
    "    .config(\"spark.shuffle.service.enabled\", \"false\") \n",
    "    .config(\"spark.dynamicAllocation.enabled\", \"false\") \n",
    "    .config(\"spark.sql.catalogImplementation\", \"hive\")\n",
    "    .config(\"spark.sql.repl.eagerEval.enabled\", \"true\")\n",
    "    .config(\"spark.driver.allowMultipleContexts\", \"true\")\n",
    "    .getOrCreate()\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[32mimport \u001b[39m\u001b[36mspark.implicits._\n",
       "\u001b[39m\n",
       "defined \u001b[32mfunction\u001b[39m \u001b[36msc\u001b[39m\n",
       "\u001b[36mhiveCxt\u001b[39m: \u001b[32msql\u001b[39m.\u001b[32mhive\u001b[39m.\u001b[32mHiveContext\u001b[39m = org.apache.spark.sql.hive.HiveContext@341b663b"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import spark.implicits._\n",
    "def sc = spark.sparkContext\n",
    "val hiveCxt = new org.apache.spark.sql.hive.HiveContext(sc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "defined \u001b[32mclass\u001b[39m \u001b[36mRichDF\u001b[39m"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "// Credit to Aivean\n",
    "implicit class RichDF(val ds:DataFrame) {\n",
    "    def showHTML(limit: Int = 50, truncate: Int = 100) = {\n",
    "        import xml.Utility.escape\n",
    "        val data = ds.take(limit)\n",
    "        val header = ds.schema.fieldNames.toSeq        \n",
    "        val rows: Seq[Seq[String]] = data.map { row =>\n",
    "          row.toSeq.map {cell =>\n",
    "            val str = cell match {\n",
    "              case null => \"null\"\n",
    "              case binary: Array[Byte] => binary.map(\"%02X\".format(_)).mkString(\"[\", \" \", \"]\")\n",
    "              case array: Array[_] => array.mkString(\"[\", \", \", \"]\")\n",
    "              case seq: Seq[_] => seq.mkString(\"[\", \", \", \"]\")\n",
    "              case _ => cell.toString\n",
    "            }\n",
    "            if (truncate > 0 && str.length > truncate) {\n",
    "              // do not show ellipses for strings shorter than 4 characters.\n",
    "              if (truncate < 4) str.substring(0, truncate)\n",
    "              else str.substring(0, truncate - 3) + \"...\"\n",
    "            } else {\n",
    "              str\n",
    "            }\n",
    "          }: Seq[String]\n",
    "        }\n",
    "    publish.html(s\"\"\" <table>\n",
    "                <tr>\n",
    "                 ${header.map(h => s\"<th>${escape(h)}</th>\").mkString}\n",
    "                </tr>\n",
    "                ${rows.map {row =>\n",
    "                  s\"<tr>${row.map{c => s\"<td>${escape(c)}</td>\" }.mkString}</tr>\"\n",
    "                }.mkString}\n",
    "            </table>\n",
    "        \"\"\")\n",
    "    }\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\u001b[36mcust_aw\u001b[39m: \u001b[32mDataFrame\u001b[39m = [customerid: int, namestyle: string ... 11 more fields]\n",
       "\u001b[36mcust_addr\u001b[39m: \u001b[32mDataFrame\u001b[39m = [customerid: int, addressid: int ... 1 more field]\n",
       "\u001b[36maddr\u001b[39m: \u001b[32mDataFrame\u001b[39m = [addressid: int, addressline1: string ... 5 more fields]\n",
       "\u001b[36mproduct\u001b[39m: \u001b[32mDataFrame\u001b[39m = [productid: int, name: string ... 12 more fields]\n",
       "\u001b[36morder_det\u001b[39m: \u001b[32mDataFrame\u001b[39m = [salesorderid: int, salesorderdetailid: int ... 4 more fields]\n",
       "\u001b[36morder_head\u001b[39m: \u001b[32mDataFrame\u001b[39m = [salesorderid: int, revisionnumber: int ... 17 more fields]\n",
       "\u001b[36mprod_model\u001b[39m: \u001b[32mDataFrame\u001b[39m = [productmodelid: int, name: string ... 1 more field]\n",
       "\u001b[36mprod_model_prod\u001b[39m: \u001b[32mDataFrame\u001b[39m = [productmodelid: int, productdescriptionid: int ... 1 more field]\n",
       "\u001b[36mprod_desc\u001b[39m: \u001b[32mDataFrame\u001b[39m = [productdescriptionid: int, description: string]\n",
       "\u001b[36mprod_cat\u001b[39m: \u001b[32mDataFrame\u001b[39m = [productcategoryid: int, parentproductcategoryid: int ... 1 more field]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val cust_aw = hiveCxt.table(\"sqlzoo.CustomerAW\")\n",
    "val cust_addr = hiveCxt.table(\"sqlzoo.CustomerAddress\")\n",
    "val addr = hiveCxt.table(\"sqlzoo.Address\")\n",
    "val product = hiveCxt.table(\"sqlzoo.Product\")\n",
    "val order_det = hiveCxt.table(\"sqlzoo.SalesOrderDetail\")\n",
    "val order_head = hiveCxt.table(\"sqlzoo.SalesOrderHeader\")\n",
    "val prod_model = hiveCxt.table(\"sqlzoo.ProductModel\")\n",
    "val prod_model_prod = hiveCxt.table(\"sqlzoo.ProductModelProductDescription\")\n",
    "val prod_desc = hiveCxt.table(\"sqlzoo.ProductDescription\")\n",
    "val prod_cat = hiveCxt.table(\"sqlzoo.ProductCategory\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 11.\n",
    "**For every customer with a 'Main Office' in Dallas show AddressLine1 of the 'Main Office' and AddressLine1 of the 'Shipping' address - if there is no shipping address leave it blank. Use one row per customer.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>CompanyName</th><th>Main Office</th><th>Shipping</th>\n",
       "                </tr>\n",
       "                <tr><td>Elite Bikes</td><td>Po Box 8259024</td><td>9178 Jumping St.</td></tr><tr><td>Rental Bikes</td><td>99828 Routh Street, Suite 825</td><td></td></tr><tr><td>Third Bike Store</td><td>2500 North Stemmons Freeway</td><td></td></tr><tr><td>Town Industries</td><td>P.O. Box 6256916</td><td></td></tr><tr><td>Unsurpassed Bikes</td><td>Po Box 8035996</td><td></td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(cust_aw.join(cust_addr, \"CustomerID\")\n",
    " .join(cust_aw\n",
    "       .join(cust_addr.filter(cust_addr(\"AddressType\")===\"Main Office\"),\n",
    "             \"CustomerID\")\n",
    "       .join(addr.filter(addr(\"City\")===\"Dallas\"), \"AddressID\") \n",
    "       .select(\"CustomerID\")\n",
    "       .distinct(), \n",
    "       \"CustomerID\")\n",
    " .join(addr, \"AddressID\")\n",
    " .select(\"CompanyName\", \"AddressType\", \"AddressLine1\")\n",
    " .distinct()\n",
    " .groupBy(\"CompanyName\")\n",
    " .pivot(\"AddressType\")\n",
    " .agg(first(\"AddressLine1\"))\n",
    " .na.fill(\"\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 12.\n",
    "**For each order show the SalesOrderID and SubTotal calculated three ways:**\n",
    "\n",
    "- **A) From the SalesOrderHeader**\n",
    "- **B) Sum of OrderQty*UnitPrice**\n",
    "- **C) Sum of OrderQty*ListPrice**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>SalesOrderID</th><th>SubTotal</th><th>UnitPriceSum</th><th>ListPriceSum</th>\n",
       "                </tr>\n",
       "                <tr><td>71774</td><td>880.35</td><td>713.8</td><td>1189.66</td></tr><tr><td>71776</td><td>78.81</td><td>63.9</td><td>106.5</td></tr><tr><td>71780</td><td>38418.69</td><td>29922.81</td><td>56651.56</td></tr><tr><td>71782</td><td>39785.33</td><td>33319.68</td><td>55533.31</td></tr><tr><td>71783</td><td>83858.43</td><td>65682.74</td><td>121625.43</td></tr><tr><td>71784</td><td>108561.83</td><td>89868.88</td><td>151932.58</td></tr><tr><td>71796</td><td>57634.63</td><td>47848.02</td><td>79746.71</td></tr><tr><td>71797</td><td>78029.69</td><td>65122.79</td><td>108986.4</td></tr><tr><td>71815</td><td>1141.58</td><td>926.91</td><td>1544.86</td></tr><tr><td>71816</td><td>3398.17</td><td>2847.37</td><td>4745.68</td></tr><tr><td>71831</td><td>2016.34</td><td>1712.91</td><td>2854.91</td></tr><tr><td>71832</td><td>35775.21</td><td>28950.48</td><td>50559.01</td></tr><tr><td>71845</td><td>41622.05</td><td>34118.3</td><td>57768.21</td></tr><tr><td>71846</td><td>2453.76</td><td>1884.38</td><td>3592.65</td></tr><tr><td>71856</td><td>602.19</td><td>500.3</td><td>833.84</td></tr><tr><td>71858</td><td>13823.71</td><td>11528.8</td><td>19214.74</td></tr><tr><td>71863</td><td>3324.28</td><td>2777.05</td><td>4633.78</td></tr><tr><td>71867</td><td>1059.31</td><td>858.9</td><td>1431.5</td></tr><tr><td>71885</td><td>550.39</td><td>524.64</td><td>874.44</td></tr><tr><td>71895</td><td>246.74</td><td>221.24</td><td>368.76</td></tr><tr><td>71897</td><td>12685.89</td><td>10585.01</td><td>17641.75</td></tr><tr><td>71898</td><td>63980.99</td><td>53248.57</td><td>88747.82</td></tr><tr><td>71899</td><td>2415.67</td><td>1856.18</td><td>3545.67</td></tr><tr><td>71902</td><td>74058.81</td><td>59893.72</td><td>106151.57</td></tr><tr><td>71915</td><td>2137.23</td><td>1732.86</td><td>2888.15</td></tr><tr><td>71917</td><td>40.9</td><td>37.73</td><td>62.93</td></tr><tr><td>71920</td><td>2980.79</td><td>2527.08</td><td>4211.88</td></tr><tr><td>71923</td><td>106.54</td><td>96.03</td><td>166.81</td></tr><tr><td>71935</td><td>6634.3</td><td>5533.78</td><td>9229.27</td></tr><tr><td>71936</td><td>98278.69</td><td>79589.08</td><td>138124.87</td></tr><tr><td>71938</td><td>88812.86</td><td>5102.95</td><td>8504.95</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(order_head.select(\"SalesOrderID\", \"SubTotal\")\n",
    " .join(order_det\n",
    "       .withColumn(\"SubTotal\", col(\"OrderQty\") * col(\"UnitPrice\") * \n",
    "                   (lit(1)-col(\"UnitPriceDiscount\")))\n",
    "       .groupBy(\"SalesOrderID\")\n",
    "       .agg(sum(\"SubTotal\").alias(\"UnitPriceSum\"))\n",
    "       .withColumn(\"UnitPriceSum\", round($\"UnitPriceSum\", 2)),\n",
    "      \"SalesOrderID\")\n",
    " .join(order_det\n",
    "       .join(product, \"ProductID\")\n",
    "       .withColumn(\"SubTotal\", col(\"OrderQty\") * col(\"ListPrice\"))\n",
    "       .groupBy(\"SalesOrderID\")\n",
    "       .agg(sum(\"SubTotal\").alias(\"ListPriceSum\"))\n",
    "       .withColumn(\"ListPriceSum\", round($\"ListPriceSum\", 2)),\n",
    "      \"SalesOrderID\")\n",
    " .orderBy(\"SalesOrderID\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 13.\n",
    "**Show the best selling item by value.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>ProductID</th><th>Name</th><th>sum(SubTotal)</th>\n",
       "                </tr>\n",
       "                <tr><td>969</td><td>Touring-1000 Blue, 60</td><td>37191.44</td></tr><tr><td>783</td><td>Mountain-200 Black, 42</td><td>37178.73</td></tr><tr><td>782</td><td>Mountain-200 Black, 38</td><td>35801.74</td></tr><tr><td>976</td><td>Road-350-W Yellow, 48</td><td>33509.58</td></tr><tr><td>957</td><td>Touring-1000 Yellow, 60</td><td>23745.32</td></tr><tr><td>967</td><td>Touring-1000 Blue, 50</td><td>22887.04</td></tr><tr><td>780</td><td>Mountain-200 Silver, 42</td><td>20879.85</td></tr><tr><td>973</td><td>Road-350-W Yellow, 40</td><td>20411.8</td></tr><tr><td>784</td><td>Mountain-200 Black, 46</td><td>19277.86</td></tr><tr><td>781</td><td>Mountain-200 Silver, 46</td><td>18095.87</td></tr><tr><td>966</td><td>Touring-1000 Blue, 46</td><td>17165.28</td></tr><tr><td>954</td><td>Touring-1000 Yellow, 46</td><td>17165.28</td></tr><tr><td>974</td><td>Road-350-W Yellow, 42</td><td>15887.18</td></tr><tr><td>779</td><td>Mountain-200 Silver, 38</td><td>15311.89</td></tr><tr><td>793</td><td>Road-250 Black, 44</td><td>13194.09</td></tr><tr><td>972</td><td>Touring-2000 Blue, 54</td><td>13120.38</td></tr><tr><td>748</td><td>HL Mountain Frame - Silver, 38</td><td>13099.2</td></tr><tr><td>979</td><td>Touring-3000 Blue, 50</td><td>10244.43</td></tr><tr><td>743</td><td>HL Mountain Frame - Black, 42</td><td>9717.12</td></tr><tr><td>797</td><td>Road-550-W Yellow, 38</td><td>9412.06</td></tr><tr><td>965</td><td>Touring-3000 Yellow, 62</td><td>8744.85</td></tr><tr><td>968</td><td>Touring-1000 Blue, 54</td><td>8582.64</td></tr><tr><td>742</td><td>HL Mountain Frame - Silver, 46</td><td>8187.0</td></tr><tr><td>961</td><td>Touring-3000 Yellow, 44</td><td>8017.38</td></tr><tr><td>794</td><td>Road-250 Black, 48</td><td>7330.05</td></tr><tr><td>953</td><td>Touring-2000 Blue, 60</td><td>7289.1</td></tr><tr><td>885</td><td>HL Touring Frame - Yellow, 60</td><td>7228.2</td></tr><tr><td>955</td><td>Touring-1000 Yellow, 50</td><td>7152.2</td></tr><tr><td>958</td><td>Touring-3000 Blue, 54</td><td>6681.15</td></tr><tr><td>970</td><td>Touring-2000 Blue, 46</td><td>6560.19</td></tr><tr><td>971</td><td>Touring-2000 Blue, 50</td><td>6560.19</td></tr><tr><td>956</td><td>Touring-1000 Yellow, 54</td><td>5721.76</td></tr><tr><td>747</td><td>HL Mountain Frame - Black, 38</td><td>5668.32</td></tr><tr><td>998</td><td>Road-750 Black, 48</td><td>5183.84</td></tr><tr><td>838</td><td>HL Road Frame - Black, 44</td><td>5153.4</td></tr><tr><td>739</td><td>HL Mountain Frame - Silver, 42</td><td>4912.2</td></tr><tr><td>836</td><td>ML Road Frame-W - Yellow, 48</td><td>4282.8</td></tr><tr><td>892</td><td>HL Touring Frame - Blue, 54</td><td>4216.45</td></tr><tr><td>981</td><td>Mountain-400-W Silver, 40</td><td>4155.21</td></tr><tr><td>983</td><td>Mountain-400-W Silver, 46</td><td>4155.21</td></tr><tr><td>962</td><td>Touring-3000 Yellow, 50</td><td>4008.69</td></tr><tr><td>999</td><td>Road-750 Black, 52</td><td>3887.88</td></tr><tr><td>992</td><td>Mountain-500 Black, 48</td><td>3887.88</td></tr><tr><td>835</td><td>ML Road Frame-W - Yellow, 44</td><td>3569.0</td></tr><tr><td>977</td><td>Road-750 Black, 58</td><td>3563.89</td></tr><tr><td>889</td><td>HL Touring Frame - Yellow, 54</td><td>3011.75</td></tr><tr><td>864</td><td>Classic Vest, S</td><td>2968.93</td></tr><tr><td>795</td><td>Road-250 Black, 52</td><td>2932.02</td></tr><tr><td>951</td><td>HL Crankset</td><td>2915.88</td></tr><tr><td>963</td><td>Touring-3000 Yellow, 54</td><td>2672.46</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(order_det.join(product, \"ProductID\")\n",
    " .withColumn(\"SubTotal\", col(\"OrderQty\") * col(\"UnitPrice\"))\n",
    " .groupBy(\"ProductID\", \"Name\")\n",
    " .sum(\"SubTotal\")\n",
    " .withColumn(\"sum(SubTotal)\", round($\"sum(SubTotal)\", 2))\n",
    " .orderBy(col(\"Sum(SubTotal)\").desc)\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 14.\n",
    "**Show how many orders are in the following ranges (in $):**\n",
    "\n",
    "```\n",
    "    RANGE      Num Orders      Total Value\n",
    "    0-  99\n",
    "  100- 999\n",
    " 1000-9999\n",
    "10000-\n",
    "```"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>RANGE</th><th>count(SubTotal)</th><th>sum(SubTotal)</th>\n",
       "                </tr>\n",
       "                <tr><td>    0-  99</td><td>3</td><td>158.66</td></tr><tr><td>  100- 999</td><td>5</td><td>2386.21</td></tr><tr><td> 1000-9999</td><td>10</td><td>27561.43</td></tr><tr><td>10000-    </td><td>14</td><td>835326.81</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(order_head\n",
    " .withColumn(\"RANGE\", \n",
    "             when(col(\"SubTotal\").between(0, 99.99), \"    0-  99\")\n",
    "             .when(col(\"SubTotal\").between(100, 999.99), \"  100- 999\")\n",
    "             .when(col(\"SubTotal\").between(1000, 9999.99), \" 1000-9999\")\n",
    "             .otherwise(\"10000-    \"))\n",
    " .groupBy(\"RANGE\")\n",
    " .agg(count(\"SubTotal\"), sum(\"SubTotal\"))\n",
    " .withColumn(\"sum(SubTotal)\", round($\"sum(SubTotal)\", 2))\n",
    " .orderBy(\"RANGE\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 15.\n",
    "**Identify the three most important cities. Show the break down of top level product category against city.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " <table>\n",
       "                <tr>\n",
       "                 <th>City</th><th>catg_name</th><th>sum(amount)</th>\n",
       "                </tr>\n",
       "                <tr><td>London</td><td>Bottom Brackets</td><td>388.73</td></tr><tr><td>London</td><td>Brakes</td><td>255.6</td></tr><tr><td>London</td><td>Chains</td><td>36.42</td></tr><tr><td>London</td><td>Cranksets</td><td>1773.81</td></tr><tr><td>London</td><td>Derailleurs</td><td>638.85</td></tr><tr><td>London</td><td>Gloves</td><td>88.14</td></tr><tr><td>London</td><td>Handlebars</td><td>292.63</td></tr><tr><td>London</td><td>Helmets</td><td>20.99</td></tr><tr><td>London</td><td>Mountain Bikes</td><td>50881.99</td></tr><tr><td>London</td><td>Mountain Frames</td><td>24018.8</td></tr><tr><td>London</td><td>Pedals</td><td>390.32</td></tr><tr><td>London</td><td>Road Bikes</td><td>5102.95</td></tr><tr><td>London</td><td>Saddles</td><td>284.43</td></tr><tr><td>London</td><td>Shorts</td><td>1072.14</td></tr><tr><td>Union City</td><td>Bike Racks</td><td>144.0</td></tr><tr><td>Union City</td><td>Bottles and Cages</td><td>29.9</td></tr><tr><td>Union City</td><td>Caps</td><td>57.31</td></tr><tr><td>Union City</td><td>Cleaners</td><td>38.16</td></tr><tr><td>Union City</td><td>Gloves</td><td>264.42</td></tr><tr><td>Union City</td><td>Handlebars</td><td>72.16</td></tr><tr><td>Union City</td><td>Helmets</td><td>721.69</td></tr><tr><td>Union City</td><td>Hydration Packs</td><td>296.91</td></tr><tr><td>Union City</td><td>Jerseys</td><td>1597.49</td></tr><tr><td>Union City</td><td>Pedals</td><td>490.73</td></tr><tr><td>Union City</td><td>Road Bikes</td><td>53478.76</td></tr><tr><td>Union City</td><td>Road Frames</td><td>10031.9</td></tr><tr><td>Union City</td><td>Socks</td><td>5.39</td></tr><tr><td>Union City</td><td>Tires and Tubes</td><td>8.22</td></tr><tr><td>Union City</td><td>Vests</td><td>904.95</td></tr><tr><td>Woolston</td><td>Bike Racks</td><td>432.0</td></tr><tr><td>Woolston</td><td>Bottles and Cages</td><td>31.79</td></tr><tr><td>Woolston</td><td>Caps</td><td>53.9</td></tr><tr><td>Woolston</td><td>Cleaners</td><td>38.16</td></tr><tr><td>Woolston</td><td>Gloves</td><td>161.59</td></tr><tr><td>Woolston</td><td>Helmets</td><td>495.36</td></tr><tr><td>Woolston</td><td>Hydration Packs</td><td>296.91</td></tr><tr><td>Woolston</td><td>Jerseys</td><td>1178.02</td></tr><tr><td>Woolston</td><td>Saddles</td><td>63.16</td></tr><tr><td>Woolston</td><td>Tires and Tubes</td><td>10.96</td></tr><tr><td>Woolston</td><td>Touring Bikes</td><td>77040.15</td></tr><tr><td>Woolston</td><td>Touring Frames</td><td>9430.95</td></tr><tr><td>Woolston</td><td>Vests</td><td>1108.19</td></tr>\n",
       "            </table>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "(addr\n",
    " .join(addr\n",
    "       .join(order_head, (addr(\"AddressID\")===order_head(\"ShipToAddressID\")))\n",
    "       .groupBy(\"City\")\n",
    "       .sum(\"SubTotal\")\n",
    "       .orderBy(col(\"sum(SubTotal)\").desc)\n",
    "       .limit(3),\n",
    "      \"City\")\n",
    " .join(order_head, (addr(\"AddressID\")===order_head(\"ShipToAddressID\")))\n",
    " .join(order_det, \"SalesOrderID\")\n",
    " .join(product, \"ProductID\")\n",
    " .join(prod_cat.withColumnRenamed(\"name\", \"catg_name\"), \n",
    "       \"ProductCategoryID\")\n",
    " .withColumn(\"amount\", col(\"OrderQty\") * col(\"UnitPrice\"))\n",
    " .groupBy(\"City\", \"catg_name\")\n",
    " .sum(\"amount\")\n",
    " .withColumn(\"sum(amount)\", round($\"sum(amount)\", 2))\n",
    " .orderBy(\"City\", \"catg_name\")\n",
    " .showHTML())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark.stop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Scala",
   "language": "scala",
   "name": "scala"
  },
  "language_info": {
   "codemirror_mode": "text/x-scala",
   "file_extension": ".sc",
   "mimetype": "text/x-scala",
   "name": "scala",
   "nbconvert_exporter": "script",
   "version": "2.12.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
